import os, sys, gc
import base64, pickle
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True) # prevent numpy exponential
# (scientific) notation when printing arrays
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 12})
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Add higher (grand-parent) directory to python modules path, so that the
# `my_NLP_RNN_fr_lib` package imported throughout this notebook resolves =>
sys.path.append(os.path.realpath("..\\..\\"))
%%html
<style type='text/css'>
.CodeMirror{
font-size: 8.5pt;
}</style>
In this notebook, we employ random search techniques to look for optimized values for the hyperparameters of the French NLP model developed in the parent notebook. Random search consists of randomly assigning values to the hyperparameters we are trying to tune for model performance. For our French Reviews NLP model, we consider the RMSE as our performance metric of choice.
The standard random search methodology will subsequently be enhanced by relying on an XGBoost regressor to dig deeper and discover sets of NLP hyperparameter values that allow the French NLP model to deliver even better performance.
We, here, during the optimization loop, use a "light" version of the vocabulary which is made up of "only" the words we encounter at least once in our training Dataset. We do that simply to lighten the footprint of our models in memory (@see this notebook for a grasp on how we did it).
Note that, for production, we would not do that since benefiting from a large vocabulary allows for our final model to "generalize" to words it had never seen during training when generating future predictions.
from my_NLP_RNN_fr_lib.fastText import read_fastText_vecs
# Load the "light" (training-vocabulary-only) French fastText embeddings :
# - word_to_index / index_to_word : vocabulary <-> embedding-row lookups
# - word_to_vec_map : word -> embedding vector (presumably 300-d, per the
#   "cc.fr.300" file name — TODO confirm)
# NOTE: `os.path.join('..', '..')` replaces the former "..\.." literal, whose
# `\.` is an invalid escape sequence (SyntaxWarning on recent Python versions)
# and which only worked on Windows anyway.
word_to_index, index_to_word, word_to_vec_map = \
    read_fastText_vecs(os.path.join(os.path.realpath(os.path.join('..', '..')),
                                    'data', 'fastText_french', 'cc.fr.300.light.vec'))
Adding the "<UNK>" ("unknown") token to the vocabulary for any misspelled words from the training dataset :
# Map the "<UNK>" token to the mean of all known embedding vectors, and
# register it at the next free (1-based) index of the vocabulary lookups.
unk_embedding = np.zeros(next(iter(word_to_vec_map.values())).shape)
for vec in word_to_vec_map.values():
    unk_embedding += vec
unk_embedding /= len(word_to_vec_map)
w = "<UNK>"
word_to_vec_map[w] = unk_embedding
del unk_embedding
word_to_index[w] = len(word_to_index) + 1
index_to_word[len(word_to_index)] = w
from my_NLP_RNN_fr_lib.tweet_utils import \
load_review_fr, clean_tweet_fr, TWEET_MAX_CHAR_COUNT
from my_NLP_RNN_fr_lib.tokenizer_utils import tokenize_tweet_fr
# loading
reviews = load_review_fr()
# preprocessing (text cleanup)
reviews = clean_tweet_fr(reviews, col_name='comment')
# tokenization (intermediary step, only used here to measure review lengths)
tokenized_reviews = tokenize_tweet_fr(reviews, col_name='comment')
# filtering-out empty reviews and reviews exceeding the max allowed length
token_counts = tokenized_reviews.map(len)
out_of_bounds_index = token_counts[(token_counts > TWEET_MAX_CHAR_COUNT)
                                   | (token_counts == 0)].index
reviews.drop(out_of_bounds_index, inplace=True)
reviews.reset_index(drop=True, inplace=True)
del tokenized_reviews, token_counts, out_of_bounds_index
dummy = gc.collect()
During the optimization phase, it is common practice to train model instances on small subsets of the training data. This allows for much faster iterating, while gaining valuable insights into which sets of hyperparameter values are the most promising when later applied to the whole dataset.
We will identify the most promising model (the most promising set of NLP hyperparameter values) based on model performance. And in order to compare apples to apples and not apples to oranges, we need to ensure that the models' performances are measured against the same datasets. For that purpose, we employ seeding.
retaining only 1/20th of the data (in a reproducible manner), for training speedup during the optimization loop =>
# sub-sampling the French Reviews dataset for faster optimization experimenting =>
import random
# Freeze the RNG state so that every trained model instance (whatever its
# hyperparameters) is compared against the exact same data subset ;
# the previous "random" state is restored once the splitting is done (below).
seed_state = np.random.get_state()
np.random.seed(1234)
random.seed(1234)
rows = random.sample(list(reviews.index), int(reviews.shape[0] / 20))
reviews_20th = reviews.iloc[rows]
print(f"Entire French Reviews dataset : {reviews.shape[0]:,} records - "
      f"Retained subset : {reviews_20th.shape[0]:,} records")
reviews_20th.reset_index(drop=True, inplace=True)
# turning lists of word-tokens into lists of embedding matrix row indices =>
from my_NLP_RNN_fr_lib.fastText import sentences_to_indices
reviews_X = sentences_to_indices(reviews_20th['comment'], word_to_index,
                                 max_len=TWEET_MAX_CHAR_COUNT)
reviews_Y = reviews_20th['rating']
# NOTE: `reviews` is kept in memory on purpose (the sanity check below uses it)
#del reviews ; gc.collect()
# extracting a validation sub-subset =>
from sklearn.model_selection import train_test_split
test_prop = .15
X_train, X_valid, y_train, y_valid = train_test_split(reviews_X, reviews_Y,
                                                      test_size=test_prop)
# restore a "random" seed state (the one in effect before we fixed it above)
np.random.set_state(seed_state)
np.random.seed(None)
random.seed(None)
# data sanity check ; ensure no "rating" is malformed.
# An explicit raise is used instead of `assert`, which would be silently
# stripped when the notebook code runs under `python -O`.
if (reviews['rating'].isnull().any()
        or not np.issubdtype(reviews['rating'].dtype, np.number)):
    raise ValueError('we have a "rating is malformed" issue with the training data !')
During the hyperparameters random search, we train several instances of an NLP model. An effective way to track progress during that phase is through TensorBoard. TensorBoard plots in (near) real time the different model metrics of our choosing. We will be tracking RMSE and loss measures on the training and validation subsets.
Monitoring models training :
# TensorBoard launcher helper (Windows) for monitoring the training runs
from my_NLP_RNN_fr_lib.jupyter_tensorboard_windows_launcher import JupyterTensorboardWindows
# logs directory (presumably the one the training routine writes its metrics
# to — confirm against `train_models`)
log_dir = os.path.join(os.path.realpath("..\.."), "logs", "fit")
tb_launcher = JupyterTensorboardWindows( log_dir )
tb_launcher.run()
# On Windows, also view TensorBoard through "http://localhost:6006/"
%reload_ext tensorboard.notebook
# embed a Tensorboard HTML page in an iframe in the below jupyter notebook cell (will only show when running)
%tensorboard --logdir {log_dir.replace("\\", "\\\\")}
from my_NLP_RNN_fr_lib.display_helper import format_vertical_headers
from my_NLP_RNN_fr_lib.model.hyperparameters import get_new_hyperparameters
We perform hyperparameter optimization through random value search, using logarithmic or truncated normal distributions as best fits. For details on how this has been implemented, refer to the my_NLP_RNN_fr_lib.model.hyperparameters module. It has been designed for the NLP model architecture we employ.
Below is an example of {{examples_count = 2 ; examples_count}} such randomly-generated sets of NLP hyperparameters values :
# generate `examples_count` random sets of NLP hyperparameter values
hyperparameters = get_new_hyperparameters(models_count = examples_count, verbose = 0)
# flag these rows as coming from pure random search
# (as opposed to XGBoost-identified sets, see further below)
hyperparameters.insert(loc=0, column='is_random_set',
                       value=[1] * len(hyperparameters))
format_vertical_headers(
    hyperparameters.drop(columns=['is_random_set'])
                   .reset_index(level=0)
                   .rename(columns={'index': 'model instance'})
)
from my_NLP_RNN_fr_lib.model.training import train_models
from my_NLP_RNN_fr_lib.model.plot import plot_trained_models_history
The routine in charge of the training of the model instances has been designed so that the training can take place sequentially, in waves.
Each wave can consist of any number of model instances and, all the results are maintained in a global csv file listing hyperparameter values and resulting model performance measures. All the model instances training history and model weights are also stored locally for latter re-use if needed (for instance for purposes such as cross-instance analyses).
Below, we can see the trace of the training for our wave of above-generated sets of hyperparameter values :
csv_full_path = os.path.join(os.path.realpath(".."), 'trained_instances.csv')
save_root = os.path.join(os.path.realpath(".."), 'trained_instances')
# training setup shared by every model instance of the wave
batch_size = 512
epochs = 120
# sequentially train one NLP model instance per hyperparameters row ;
# results are listed in the csv at `csv_full_path`, instance weights and
# histories are stored under `save_root`
train_models(
    hyperparameters,
    (X_train, X_valid, y_train, y_valid),
    word_to_vec_map, word_to_index,
    csv_full_path, save_root,
    batch_size=batch_size, epochs=epochs,
    verbose=1,
)

trained_models_df = pd.read_csv(csv_full_path)
# keep only models trained on dataset of same size (the same dataset, since we seeded it)
same_setup_mask = (
    (trained_models_df["training records"] == X_train.shape[0])
    & (trained_models_df["validation records"] == X_valid.shape[0])
    & (trained_models_df["batch_size"] == batch_size)
    & (trained_models_df["epochs"] == epochs)
)
trained_models_df = trained_models_df[same_setup_mask]
# best (lowest) validation RMSE first
trained_models_df = trained_models_df.sort_values(by="val_rmse__last_10_avg",
                                                  ascending=True)
# keep a pointer to the (1-based) row position within the csv file
trained_models_df.insert(0, 'csv_idx', trained_models_df.index + 1)
trained_models_df.reset_index(drop=True, inplace=True)
trained_models_df.index += 1
# subset : models originating from pure random search
trained_models_rndm_df = \
    trained_models_df[trained_models_df["is_random_set"] == True].reset_index(drop=True)
trained_models_rndm_df.index += 1
#print(str(trained_models_rndm_df.shape[0]) + " trained models in total")
# training-history curves for the top models
# (`best_models_count` comes from an earlier cell — not visible here)
plot_trained_models_history(
    trained_models_rndm_df.head(best_models_count), validation_only=True)
import matplotlib.colors as colors
# green (good / low) -> red (bad / high) gradient for the RMSE/loss columns
cmap = colors.LinearSegmentedColormap.from_list("gyr", ["green", "yellow", "red"])
hidden_cols = ['timestamp', 'is_random_set',
               'training records', 'validation records',
               'best_training_step', 'best_val_rmse', 'val_loss',
               'batch_size', 'epochs', 'local_path']
styled_table = (trained_models_rndm_df.head(best_models_count)
                .drop(hidden_cols, axis=1)
                .reset_index(level=0).rename(columns={'index': 'trained model rank'})
                .style.background_gradient(cmap=cmap, subset=['val_rmse__last_10_avg'])
                .background_gradient(cmap=cmap, subset=['val_loss__last_10_avg']))
format_vertical_headers(
    styled_table,
    greyed_out_colnames=['trained model rank', 'csv_idx', 'val_rmse__last_10_avg',
                         'val_loss__last_10_avg', 'train_rmse__last_10_avg',
                         'train_loss__last_10_avg'])
from my_NLP_RNN_fr_lib.model.plot import hyperparameters_univariate_rmse_plot
# one univariate RMSE plot per hyperparameter ; "extreme outliers" (beyond the
# threshold) are excluded so the plots keep a readable scale
hyperparameters_univariate_rmse_fig = hyperparameters_univariate_rmse_plot(
    trained_models_rndm_df,
    col_name='val_rmse__last_10_avg',
    outliers_threshold=1.2,
)
from my_NLP_RNN_fr_lib.display_helper import two_columns_display
# correlation matrix between hyperparameter values and the retained
# performance measure ('val_rmse__last_10_avg')
performance_correlation_matrix = \
trained_models_rndm_df.drop(['timestamp', 'csv_idx', 'is_random_set', 'training records', 'validation records'
, 'best_training_step', 'best_val_rmse', 'val_loss'
, 'val_loss__last_10_avg', 'train_rmse__last_10_avg', 'train_loss__last_10_avg'
, 'batch_size', 'epochs', 'local_path'], axis=1).corr()
# only keeping the upper triangular half (the matrix is symmetric)
# NOTE(review): masking via `arr[arr == 0] = np.nan` would also blank out any
# genuine zero correlation coefficient — confirm this is acceptable
arr = np.triu(performance_correlation_matrix, k=0) ; arr[arr == 0] = np.nan
# plotting
fontsize = 13 ; plt.ioff() ; fig = plt.figure(figsize=(4, 5)) ; ax = plt.gca()
htmp = ax.matshow(arr.astype('float'))
ax.set_xticks(range(performance_correlation_matrix.shape[1]), minor=False)
ax.set_xticklabels(performance_correlation_matrix.columns, fontsize=fontsize, rotation=45, ha="left")
ax.set_yticks(range(performance_correlation_matrix.shape[1]), minor=False)
ax.set_yticklabels(performance_correlation_matrix.columns, fontsize=fontsize)
cb = fig.colorbar(htmp, orientation='horizontal') ; cb.ax.tick_params(labelsize=fontsize)
ax.text(.5,.05,'Correlation Matrix', horizontalalignment='center', transform=ax.transAxes, fontsize=14)
#plt.show()
# save the figure to a temporary .jpg file (directory created if missing)
if not os.path.isdir(os.path.join('..', '..', 'tmp')) : os.makedirs(os.path.join('..', '..', 'tmp'))
title = "hyperparameters_correlation"
fig_url = os.path.join('..', '..', 'tmp', title+".jpg")
plt.savefig(fig_url, bbox_inches='tight') ; plt.close(fig)
# use base64 encoding to circumvent image (external file) caching (webbrowser)
with open(fig_url, "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read())
encoded_string = "data:image/jpeg;charset=utf-8;base64, " + encoded_string.decode()
os.remove(fig_url)
# absolute_performance_correlation : hyperparameters ranked by |corr| with the
# validation RMSE (the target column itself is dropped via `.iloc[:-1]`)
absolute_performance_correlation = \
pd.DataFrame(abs(performance_correlation_matrix['val_rmse__last_10_avg']).iloc[:-1].sort_values(ascending=False)
).rename(columns={'val_rmse__last_10_avg': 'absolute performance correlation coeff.'})
# most performance-correlated hyperparameter (re-used by the plots below)
hyperparameter = absolute_performance_correlation.iloc[0].name
two_columns_display('<img src="' + encoded_string + '" />', absolute_performance_correlation.to_html(), .55)
from my_NLP_RNN_fr_lib.model.plot import hyperparameter_rmse_boxplot
# performance measure the box plots are built on (re-used further below)
col_name = 'val_rmse__last_10_avg'
plt.ioff() ; fig, ax = plt.subplots(figsize=(6, 4))
# per-bin RMSE box plots for the most performance-correlated hyperparameter
bins_edges, grouped_medians = \
hyperparameter_rmse_boxplot(trained_models_rndm_df
, hyperparameter, ax
, bins_count=10
, bin_edge_format="{0:0.0f}" # "{0:.2f}" # "{0:.0%}" #
, xlabels_ha='center'
, col_name = col_name
)
fig.suptitle('Distribution of Performance among ' + str(trained_models_rndm_df.shape[0]) +
' Trained NLP Models')
fig.subplots_adjust(top=.93) # remove extra spacing underneath 'fig.suptitle'
#plt.show()
# save the figure to a temporary .jpg file (directory created if missing)
if not os.path.isdir(os.path.join('..', '..', 'tmp')) : os.makedirs(os.path.join('..', '..', 'tmp'))
title = "top_hyperparameter_boxplot"
fig_url = os.path.join('..', '..', 'tmp', title+".jpg")
plt.savefig(fig_url, bbox_inches='tight') ; plt.close(fig)
# use base64 encoding to circumvent image (external file) caching (webbrowser)
with open(fig_url, "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read())
encoded_string = "data:image/jpeg;charset=utf-8;base64, " + encoded_string.decode()
os.remove(fig_url)
# hyperparameter bin achieving the lowest median RMSE (re-used further below)
lowest_median_idx = grouped_medians[col_name + ' (bin median)'].idxmin()
#print("lowest_median_idx : " + str(lowest_median_idx))
two_columns_display(grouped_medians.to_html(), '<img src="' + encoded_string + '" />', .33)
from my_NLP_RNN_fr_lib.model.plot import stratified_hyperparameter_rmse_boxplot
# second most performance-correlated hyperparameter
child_hyperparameter = absolute_performance_correlation.iloc[1].name # 'spatial_dropout_prop' #
# RMSE box plots of the child hyperparameter, stratified by the parent
# (most performance-correlated) hyperparameter — one subplot per parent stratum
fig = stratified_hyperparameter_rmse_boxplot(
trained_models_rndm_df
, parent_hyperparameter = hyperparameter, parent_strats_count = 10, parent_strat_edge_format = "{0:0.0f}" # "{0:.2f}" #
, hyperparameter = child_hyperparameter, bins_count=15, bin_edge_format = "{0:0.0f}" # "{0:.3f}" #
, col_name = col_name
, ncols = 5, figsize=(15, 6)
)
# allow space for 'fig.suptitle'
fig.tight_layout(rect=[0., 0., 1, 0.97]) # rect : tuple (left, bottom, right, top)
fig.suptitle('Distribution of Performance among ' +
str(trained_models_rndm_df.shape[0]) + ' Trained NLP Models')
# highlight the subplot of the best-performing parent stratum
# NOTE(review): the `- 1` assumes `lowest_median_idx` is 1-based while the
# axes list is 0-based — confirm against `hyperparameter_rmse_boxplot`'s output
best_parent_strat_idx = lowest_median_idx - 1
best_strat_ax = fig.get_axes()[best_parent_strat_idx]
print("best_parent_strat : " + best_strat_ax.get_title())
best_strat_ax.set_title(
label = "- " + best_strat_ax.get_title() + " -"
, fontdict = {'fontsize':12, 'color':'blue', 'fontweight':'bold'}
, backgroundcolor='yellow'
)
# thick blue frame + translucent yellow background on the highlighted subplot
for ax_spine in ['left', 'top', 'right', 'bottom'] :
    best_strat_ax.spines[ax_spine].set_linewidth(2)
    best_strat_ax.spines[ax_spine].set_edgecolor('blue')
best_strat_ax.patch.set_facecolor('yellow')
best_strat_ax.patch.set_alpha(0.1)
plt.show()
import xgboost as xgb, multiprocessing
print(f"xgboost version {xgb.__version__}")
# XGBoost regressor training data : one row per trained NLP model instance ;
# features = hyperparameter values, target = validation RMSE
non_feature_cols = [
    'timestamp', 'csv_idx', 'training records', 'validation records', 'is_random_set',
    'lr_decay_rate', 'lr_decay_step', 'batch_size', 'epochs', 'best_training_step',
    'best_val_rmse', 'val_loss', 'val_rmse__last_10_avg', 'val_loss__last_10_avg',
    'train_rmse__last_10_avg', 'train_loss__last_10_avg', 'local_path']
xgbR_X = trained_models_df.loc[:, ~trained_models_df.columns.isin(non_feature_cols)]
xgbR_y = trained_models_df['val_rmse__last_10_avg']
#print(str(xgbR_X.shape[0]) + " trained NLP models") # NOTE THAT WE NO LONGER EXCLUDE XGBoost-IDENTIFIED SETS OF HYPERPARAMETERS (see "feedback loop" below)
#print(str(xgbR_X.shape[0]) + " trained NLP models") # NOTE THAT WE NO LONGUER EXCLUDE XGBoost-IDENTIFIED SETS OF HYPERPARAMETERS (see "feedback loop" below)
from my_NLP_RNN_fr_lib.xgboost import xgbR_gridsearch_cv
# regressor (base estimator for the grid search below)
xgb_R = xgb.XGBRegressor(
    n_estimators=25000,
    max_depth=20,
    num_parallel_tree=2,
    learning_rate=0.05,
    reg_alpha=0.5,   # L1 regularization term on weights, default=0
    reg_lambda=0.5,  # L2 regularization term on weights, default=1
    verbosity=0,
    objective='reg:squarederror',
    tree_method='exact',  # recommended for small datasets
    #tree_method='gpu_hist',
    n_jobs=1,
)
# gridsearch domain
xgbR_params_grid = {
    'learning_rate': [x*10**(-e) for e in range(3, 4) for x in [1, 5]],
    #'max_depth': [8, 10, 15, 20],
    'num_parallel_tree': [1, 4, 8],
    'subsample': [.3, .5, .7, 1.],
    'reg_alpha': [.001, .002, .005, .01],
    'reg_lambda': [.1, .4, .9, 5, 10],
}
# cross-validated grid search over the above domain, refitting the best candidate
xgbR_gridCv = xgbR_gridsearch_cv(
    xgb_R,
    xgbR_X, xgbR_y,
    params_grid=xgbR_params_grid,
    cv_folds_count=3, test_size=.3,
    early_stopping_rounds=2000,
    refit=True,
    n_jobs=multiprocessing.cpu_count() - 1,
    par_backend='loky',  # 'multiprocessing' # 'threading' #
    verbose=2,
)
Fitting 3 folds for each of 128 candidate xgboost configurations, totalling 384 fits

from my_NLP_RNN_fr_lib.xgboost import xgbR_gridsearch_cv_plot
# cross-validation results overview, one subplot per grid-searched parameter
# (NB: `gridsearch_cv_visualize_restrict_rank` comes from an earlier cell,
# not visible here)
fig = xgbR_gridsearch_cv_plot(
    xgbR_gridCv,
    params_subplots=True,
    restrict_rank=gridsearch_cv_visualize_restrict_rank,
    include_train_rmse=True,
)
plt.show()
from matplotlib import gridspec
import random
fig = plt.figure(figsize=(17, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 3])
ax0 = plt.subplot(gs[0])
ax1 = plt.subplot(gs[1])
# training history (per-round RMSE on both eval sets of the refitted best candidate)
ax0.plot(xgbR_gridCv.best_estimator_.evals_result()['validation_0']['rmse'], label="training dataset")
ax0.plot(xgbR_gridCv.best_estimator_.evals_result()['validation_1']['rmse'], label="validation dataset")
ax0.title.set_text("XGBoost Regressor training history (RMSE measures)\n[whole dataset ; refitted `best` candidate]")
ax0.legend()
# sampled prediction : actual vs XGBoost-predicted NLP-model RMSE
sample_size = 50
sample_idx = random.sample(range(len(xgbR_y)), sample_size)
# BUGFIX: `trained_models_df` (hence `xgbR_X` / `xgbR_y`) carries a 1-based
# index while `sample_idx` holds 0-based positions ; label-based `.loc` could
# thus raise a KeyError (position 0) or silently shift rows by one =>
# address rows by *position* with `.iloc` instead
xgbR_y_pred = xgbR_gridCv.best_estimator_.predict(xgbR_X.iloc[sample_idx])
x_index = np.arange(len(sample_idx))
bar_width = 0.35
ax1.bar(x_index, xgbR_y.iloc[sample_idx], bar_width, label="actual (validation dataset sample)", color='black', alpha=.4)
ax1.bar(x_index + bar_width, xgbR_y_pred, bar_width, label="XGBoost-predicted", color='y', alpha=.4)
ax1.set_ylim(max(ax1.get_ylim()[0], .5), min(1.2, ax1.get_ylim()[1]))
ax1.set_title("NLP Model performance (rmse)")
ax1.legend()
ax1.get_xaxis().set_visible(False)
plt.show()
# hyperparameter importance, as measured by the fitted XGBoost booster
feature_important = xgbR_gridCv.best_estimator_.get_booster().get_score(importance_type='weight')
features_keys = list(feature_important.keys())
features_values = list(feature_important.values())
importances_df = (pd.DataFrame({"score": features_values}, index=features_keys)
                  .sort_values(by="score", ascending=False))
fig = plt.figure(figsize=(14, 3))
ax = fig.gca()
width = .8
y_index = np.arange(len(importances_df))
ax.barh(y_index, importances_df.score, width)
ax.set(yticks=y_index, yticklabels=importances_df.index,
       ylim=[2 * (width - 1), len(importances_df)])
# NOTE(review): the title says "classification" although the estimator is a
# regressor — possibly intended as "regression"
ax.set_title(label='NLP Model Architecture - Hyperparameter Importance\n' +
             r'$\it{(per}$ $\it{XGBoost}$ $\it{classification)}$')
plt.show()
# features ordered by ascending importance
features_indices = np.argsort(features_values)
ordered_features = [features_keys[i] for i in features_indices]
trees_df = xgbR_gridCv.best_estimator_.get_booster().trees_to_dataframe()
tree_view_index = 999 # ordinal number of target tree to be visualized
from IPython.core.display import display, HTML
# high resolution tree plot
gv_img = xgb.to_graphviz(xgbR_gridCv.best_estimator_, num_trees=tree_view_index, rankdir='LR')
fig_url = gv_img.render(os.path.join('..', '..', 'tmp', 'xgboost_tree'), format='png', cleanup=True)
# use base64 encoding to circumvent image (external file) caching (webbrowser)
with open(fig_url, "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read())
# BUGFIX: the rendered file is a PNG (see `format='png'` above), so label the
# data-URI accordingly (it was previously declared as "image/jpeg")
encoded_string = "data:image/png;base64, " + encoded_string.decode()
display(HTML('<a href="' + fig_url + '" target="_blank"><img src="' + encoded_string + '" style="width: 100%;"/></a>' +
             '<br /><center><em>click to enlarge</em></center>'))
# per-feature split (non-leaf node) counts within the visualized tree
tree_nodes_count_df = trees_df[(trees_df['Tree']==tree_view_index) & (trees_df['Feature']!='Leaf')] \
    .groupby(['Feature'], as_index=False)['Node'].count()
backup
# persist the fitted grid-search object to disk for later sessions
with open('xgbR_gridCv.pkl', 'wb') as grid_cv_file:
    pickle.dump(xgbR_gridCv, grid_cv_file)
reload
# restore a previously fitted grid-search object
# (pickle : only ever load this from our own trusted local file)
with open('xgbR_gridCv.pkl', 'rb') as grid_cv_file:
    xgbR_gridCv = pickle.load(grid_cv_file)
from my_NLP_RNN_fr_lib.model.supercharger import best_nlp_models_hyperparameters
# have the fitted XGBoost regressor screen large batches of candidate
# hyperparameter sets and keep the most promising ones
xgbR_best_hyperparameters = best_nlp_models_hyperparameters(
    xgb_regressor=xgbR_gridCv.best_estimator_,
    best_models_count=5,
    batch_models_count=1_000_000,
    NLP_predicted_val_rmse_threshold=.63,  # /* optional */
    show_batch_box_plots=False,
)
# persist the top candidates (at most 24) for later re-training
with open(os.path.join('..', 'xgbR_hyperparameters_pickle.pkl'), 'wb') as output_file:
    pickle.dump(xgbR_best_hyperparameters[:24].reset_index(drop=True), output_file)
from my_NLP_RNN_fr_lib.model.plot import hyperparameters_univariate_rmse_overlay_plot
# overlay the XGBoost-identified models (the non-random rows) on top of the
# earlier random-search univariate RMSE plots
xgboost_identified_rows = trained_models_df[trained_models_df["is_random_set"] == False]
xgboost_identified_nlp_models_fig = hyperparameters_univariate_rmse_overlay_plot(
    hyperparameters_univariate_rmse_fig,
    xgboost_identified_rows,
    col_name='val_rmse__last_10_avg',
)
# show the single best (lowest validation-RMSE) model among those trained
# with the given learning-rate decay schedule.
# BUGFIX: `idxmin()` returns an index *label* while `trained_models_df` is
# 1-based (see the earlier `index += 1`) ; addressing it with positional
# `.iloc` would select the row *after* the intended one (off-by-one) =>
# use label-based `.loc` instead
best_row_label = trained_models_df[
    (trained_models_df['lr_decay_rate'] == .1) &
    (trained_models_df['lr_decay_step'] == 45)
]['val_rmse__last_10_avg'].idxmin()
format_vertical_headers(
    trained_models_df.loc[[best_row_label]]
        .drop(['csv_idx', 'timestamp', 'is_random_set', 'training records', 'validation records',
               'best_val_rmse', 'val_loss', 'best_training_step', 'local_path'], axis=1),
    greyed_out_colnames=['lr_decay_rate', 'lr_decay_step', 'batch_size', 'epochs',
                         'val_rmse__last_10_avg', 'val_loss__last_10_avg',
                         'train_rmse__last_10_avg', 'train_loss__last_10_avg'])
Export Notebook to HTML (with Markdown extension cells evaluated)
from my_NLP_RNN_fr_lib.jupyter_markdown_extension import md_extension_to_html
# export this notebook, then the vocabulary-building notebook, to HTML
this_notebook = os.path.join(os.path.realpath('.'), 'hyperparameters_values_search.ipynb')
md_extension_to_html(this_notebook)
vocab_notebook = os.path.realpath(os.path.join('..', '..', 'data', 'fastText_french',
                                               'cc.fr.300.light.vec.ipynb'))
md_extension_to_html(vocab_notebook)
Manually re-train models using given sets of hyperparameters values :
# two hand-picked sets of hyperparameter values, laid out with the same
# columns as the randomly-generated ones
tmp_df01 = pd.DataFrame(
    [[1, 0.71, 90, 0.02722, 0.04377, 0.00183, 28, 3, 2, 2, 2, 0.12, 0.0278, 0.2, 49]],
    columns=hyperparameters.columns)
tmp_df02 = pd.DataFrame(
    [[1, 0.55, 110, 0.01564, 0.02171, 0.03221, 28, 5, 2, 31, 5, 0.45, 0.0235, 0.46, 17]],
    columns=hyperparameters.columns)
hyperparameters = pd.concat([tmp_df01, tmp_df02], ignore_index=True)
format_vertical_headers(
    hyperparameters.reset_index(level=0)
                   .rename(columns={'index': 'trained model'})
)